# !pip install geopandas
# !pip install h3
# !pip install folium
# !pip install osmnx
# !pip install geojson
import geopandas as gpd
import numpy as np
import json
import h3
import folium
import osmnx as ox
from shapely import wkt
from folium.plugins import HeatMap
from shapely.geometry import Polygon
from folium.plugins import MarkerCluster, HeatMap
import pandas as pd
from shapely.geometry import Polygon
from geojson import Feature, Point, FeatureCollection, Polygon
import plotly.express as px
from tqdm import tqdm
tqdm.pandas()
# Load the two input tables: per-id attribute flags and per-id location records.
interests_df, locs_df = map(
    pd.read_csv,
    ("stupino_interests.csv", "stupino_locs.csv"),
)
Построение диаграммы: сколько отметок пользователей приходится на каждый гексагон карты
# H3 resolution of the grid: the larger the value, the smaller the hexagon
# area (H3 supports resolutions 0..15).
H3_res = 9


def geo_to_h3(row):
    """Return the H3 cell index, at resolution ``H3_res``, for one location row.

    Args:
        row: Object exposing ``lat`` and ``lon`` attributes (e.g. a dataframe
            row), holding the point's latitude and longitude.

    Returns:
        The H3 cell index returned by ``h3.geo_to_h3`` for that point.
    """
    return h3.geo_to_h3(lat=row.lat, lng=row.lon, resolution=H3_res)


# Row-wise DataFrame.apply(axis=1) materialises a Series for every row, which
# dominates the runtime on millions of rows. Iterating the two coordinate
# columns directly yields the same cell ids; tqdm keeps the progress bar.
locs_df['h3_cell'] = [
    h3.geo_to_h3(lat=lat, lng=lon, resolution=H3_res)
    for lat, lon in tqdm(zip(locs_df.lat, locs_df.lon), total=len(locs_df))
]
100%|██████████| 10880142/10880142 [04:25<00:00, 41016.59it/s]
# For every H3 cell, collect the ids of all location records that fall in it.
locs_df_g = (
    locs_df
    .groupby('h3_cell')['id']
    .agg(list)
    .reset_index(name="ids")
)

# Number of location records per hexagon (the length of each id list).
locs_df_g['count'] = locs_df_g['ids'].progress_apply(len)
100%|██████████| 1573/1573 [00:00<00:00, 136458.67it/s]
from shapely.geometry import Polygon
def add_geometry(row):
    """Build the polygon outlining the H3 cell stored in ``row['h3_cell']``."""
    # geo_json=True asks h3 for GeoJSON-style boundary coordinates.
    boundary = h3.h3_to_geo_boundary(row['h3_cell'], geo_json=True)
    return Polygon(boundary)


# Attach each hexagon's outline to the grouped dataframe.
locs_df_g['geometry'] = locs_df_g.progress_apply(add_geometry, axis=1)
100%|██████████| 1573/1573 [00:00<00:00, 3812.77it/s]
def hexagons_dataframe_to_geojson(df_hex, hex_id_field, geometry_field, value_field, file_output=None):
    """Convert a dataframe of hexagons into a GeoJSON FeatureCollection.

    Every row becomes one Feature: its ``id`` is the row's ``hex_id_field``
    value, its geometry is the row's ``geometry_field`` value, and its only
    property, ``"value"``, is the row's ``value_field`` value.

    Args:
        df_hex: Dataframe with one row per hexagon.
        hex_id_field: Name of the column holding the hexagon id.
        geometry_field: Name of the column holding the hexagon geometry.
        value_field: Name of the column holding the value to attach.
        file_output: Optional path; when given, the collection is also
            written there as JSON.

    Returns:
        The FeatureCollection. It is returned whether or not ``file_output``
        is given (previously ``None`` came back when a file was written).
    """
    # Iterate the three needed columns directly instead of building a Series
    # per row with iterrows().
    list_features = [
        Feature(geometry=geometry, id=hex_id, properties={"value": value})
        for hex_id, geometry, value in zip(
            df_hex[hex_id_field], df_hex[geometry_field], df_hex[value_field]
        )
    ]
    feat_collection = FeatureCollection(list_features)

    if file_output is not None:
        # Explicit encoding so the file does not depend on the platform default.
        with open(file_output, "w", encoding="utf-8") as f:
            json.dump(feat_collection, f)

    return feat_collection
# GeoJSON features for the hexagons, keyed by cell id and carrying the record count.
geojson_obj = hexagons_dataframe_to_geojson(
    locs_df_g,
    hex_id_field='h3_cell',
    geometry_field='geometry',
    value_field='count',
)
import plotly.express as px
# Choropleth of record counts per hexagon, centred on the mean coordinate of
# all location records. The colour scale is capped at the mean count, so every
# hexagon at or above the mean gets the top colour.
fig = px.choropleth_mapbox(
    locs_df_g,
    geojson=geojson_obj,
    locations='h3_cell',
    color='count',
    labels=dict(count='count of data'),
    color_continuous_scale="Viridis",
    range_color=(0, locs_df_g['count'].mean()),
    opacity=0.1,
    mapbox_style='carto-positron',
    center=dict(lat=locs_df['lat'].mean(), lon=locs_df['lon'].mean()),
    zoom=12,
)
# Remove the figure margins so the map fills the output area.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show("notebook")
# Display the interests table loaded from the CSV.
interests_df
| marital_status_married | marital_status_not_married | employment_working | employment_not_working | availability_of_education_has_a_higher_education | availability_of_education_no_higher_education | interests_b2b_advertising_and_marketing | interests_b2b_raw_materials | interests_b2b_equipment_machines_energy_supply | interests_b2b_office | ... | interests_parents_of_toddlers | interests_parents_of_preschoolers | interests_of_parents_of_primary_school_students | interests_parents_of_middle_and_high_school_students | interests_business_education | age_17 | age_55 | gender_female | gender_male | id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 146343 |
| 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 149957 |
| 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 78692 |
| 3 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 99331 |
| 4 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 129854 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 108053 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 90108 |
| 108054 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 124168 |
| 108055 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 117301 |
| 108056 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 104867 |
| 108057 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 130525 |
108058 rows × 106 columns
# Per-column totals of the age-bracket columns, rendered with in-cell bars.
ages_df = interests_df.filter(regex=r"^age_")
(
    ages_df
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| age_18_24 | 50297 |
| age_25_34 | 53941 |
| age_35_44 | 41632 |
| age_45_54 | 22903 |
| age_17 | 15314 |
| age_55 | 27033 |
# Per-column totals of the gender columns, rendered with in-cell bars.
gender_df = interests_df.filter(regex=r"^gender_")
(
    gender_df
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| gender_female | 73533 |
| gender_male | 73437 |
# Per-column totals of the employment columns, rendered with in-cell bars.
employment_df = interests_df.filter(regex=r"^employment_")
(
    employment_df
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| employment_working | 78881 |
| employment_not_working | 26591 |
# Per-column totals of the education columns, rendered with in-cell bars.
(
    interests_df
    .filter(regex=r"^availability_of_education_")
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| availability_of_education_has_a_higher_education | 55189 |
| availability_of_education_no_higher_education | 64654 |
# Per-column totals of the children-in-family columns, rendered with in-cell bars.
(
    interests_df
    .filter(regex=r"^children_")
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| children_under_16_there_are_children_in_the_family | 71838 |
| children_under_16_no_children_in_the_family | 78563 |
# Per-column totals of the marital-status columns, rendered with in-cell bars.
(
    interests_df
    .filter(regex=r"^marital_")
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| marital_status_married | 22707 |
| marital_status_not_married | 52332 |
# Per-column totals of every column whose name contains "individual_income_",
# rendered with in-cell bars.
(
    interests_df
    .filter(like="individual_income_")
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| individual_income_a_below_average_income | 5453 |
| individual_income_b_average_income | 11620 |
| individual_income_c_above_average_income | 5440 |
| individual_income_d_high_income | 2085 |
| individual_income_e_premium | 1262 |
# Per-column totals of every column whose name contains "household_income_",
# rendered with in-cell bars.
(
    interests_df
    .filter(like="household_income_")
    .apply(sum)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| household_income_a_below_average | 3917 |
| household_income_b_average | 50697 |
| household_income_c_above_average | 86867 |
# Per-column totals of the "interests_" columns, largest first, rendered with
# in-cell bars.
(
    interests_df
    .filter(regex=r"^interests_")
    .apply(sum)
    .sort_values(ascending=False)
    .to_frame()
    .style.bar()
)
| 0 | |
|---|---|
| interests_banks_banking_services | 61024 |
| interests_new_buildings | 58974 |
| interests_car_owners | 55978 |
| interests_b2b_documentary_and_financial_and_legal_support | 52628 |
| interests_auto_premium_class | 51677 |
| interests_medium_and_large_business | 51580 |
| interests_renting_residential_property | 50192 |
| interests_b2b_it_for_business | 50041 |
| interests_auto_middle_class | 50041 |
| interests_auto_economy_class | 49551 |
| interests_b2b_raw_materials | 48303 |
| interests_resale_property | 48132 |
| interests_baby_products | 47634 |
| interests_interest_in_buying_a_new_car | 47208 |
| interests_contributions_and_deposits | 46209 |
| interests_mortgage | 45086 |
| interests_loans_for_business | 44978 |
| interests_credit_cards | 44713 |
| interests_interest_in_insurance | 44569 |
| interests_consumer_loans | 42902 |
| interests_interest_in_buying_a_new_premium_car | 42245 |
| interests_auto_insurance | 41112 |
| interests_moto | 40302 |
| interests_interest_in_buying_a_new_economy_class_car | 39955 |
| interests_interest_in_buying_a_mobile_phone | 39740 |
| interests_freight_and_commercial_vehicles | 39159 |
| interests_auto_parts_and_service | 38710 |
| interests_interest_in_buying_a_new_middle_class_car | 38656 |
| interests_all_about_children | 37114 |
| interests_overseas_real_estate | 36566 |
| interests_wedding | 35279 |
| interests_b2b_trade_equipment_and_goods_wholesale | 34546 |
| interests_used_cars | 34170 |
| interests_b2b_medical_equipment | 34113 |
| interests_baby_food | 32770 |
| interests_b2b_agriculture | 32546 |
| interests_mobile_devices | 30739 |
| interests_television_and_video_equipment | 29597 |
| interests_special_equipment | 29147 |
| interests_car_loans | 26554 |
| interests_cell_phones_and_headset | 25890 |
| interests_auto_suvs | 25263 |
| interests_tires_and_wheels | 24931 |
| interests_parents_of_toddlers | 23065 |
| interests_b2b_office | 20957 |
| interests_telecom_operators | 20718 |
| interests_quotes_stock_markets | 20256 |
| interests_laptops_and_netbooks | 19950 |
| interests_microloans | 19630 |
| interests_using_online_banking | 19531 |
| interests_pregnancy_and_childbirth | 19376 |
| interests_learning_languages | 18118 |
| interests_parents_of_middle_and_high_school_students | 17960 |
| interests_use_of_electronic_money | 17907 |
| interests_of_parents_of_primary_school_students | 16924 |
| interests_parents_of_newborns | 16849 |
| interests_b2b_equipment_machines_energy_supply | 16636 |
| interests_mobile_communications_and_internet_access | 16358 |
| interests_photo_and_video_cameras | 16136 |
| interests_houses_cottages_and_land_plots | 15382 |
| interests_education | 14821 |
| interests_small_business | 14804 |
| interests_audio_engineering | 14345 |
| interests_business_education | 13252 |
| interests_parents_of_preschoolers | 11619 |
| interests_internet_access | 11511 |
| interests_childrens_health | 11186 |
| interests_finance_and_accounting | 11114 |
| interests_commercial_real_estate | 9786 |
| interests_basic | 8983 |
| interests_legal_support | 8964 |
| interests_preschool | 8252 |
| interests_b2b_advertising_and_marketing | 6442 |
| interests_higher | 5035 |
| interests_auto_electronics_and_gps | 5010 |
| interests_tablets_and_ereaders | 3532 |
| interests_average | 2693 |
| interests_tvs | 2620 |
| interests_human_resources | 877 |
| interests_specialized_secondary | 875 |
| interests_active_mobile_internet_users | 859 |
# Hexagon under inspection, plus a second cell id kept for reference.
h3_cell = "8911810832fffff"
h3_cell_from_center = "8911817240fffff"

# Ids of the location records that fall in the chosen hexagon.
# .iloc[0] takes the first matching row by position; the label lookup [0]
# only works when the matching row happens to carry index label 0 and raises
# KeyError for any other hexagon.
ids = locs_df_g.loc[locs_df_g["h3_cell"] == h3_cell, "ids"].iloc[0]

# Keep the interest columns plus the exact "id" column. A substring test
# ('"id" in name') also matches the "individual_income_*" columns and leaks
# them into the interests ranking.
df = interests_df[[user for user in interests_df.columns if user.startswith("interests_") or user == "id"]]
df = df[df.id.isin(ids)]

# Interest totals among the ids seen in this hexagon, largest first.
df.drop("id", axis=1).apply(sum).sort_values(ascending=False).to_frame().style.bar()
| 0 | |
|---|---|
| interests_auto_parts_and_service | 3 |
| interests_new_buildings | 2 |
| interests_b2b_raw_materials | 2 |
| interests_banks_banking_services | 2 |
| interests_auto_suvs | 2 |
| interests_auto_economy_class | 2 |
| interests_auto_premium_class | 2 |
| interests_car_owners | 2 |
| interests_auto_middle_class | 2 |
| interests_interest_in_buying_a_new_economy_class_car | 2 |
| interests_interest_in_buying_a_new_car | 2 |
| interests_tires_and_wheels | 2 |
| individual_income_c_above_average_income | 1 |
| individual_income_b_average_income | 1 |
| interests_moto | 1 |
| interests_of_parents_of_primary_school_students | 1 |
| interests_mortgage | 1 |
| interests_car_loans | 1 |
| interests_cell_phones_and_headset | 1 |
| interests_auto_insurance | 1 |
| interests_parents_of_middle_and_high_school_students | 1 |
| interests_b2b_trade_equipment_and_goods_wholesale | 1 |
| interests_parents_of_newborns | 1 |
| interests_photo_and_video_cameras | 1 |
| interests_used_cars | 1 |
| interests_interest_in_buying_a_mobile_phone | 1 |
| interests_using_online_banking | 1 |
| interests_interest_in_buying_a_new_middle_class_car | 1 |
| interests_special_equipment | 1 |
| interests_freight_and_commercial_vehicles | 1 |
| interests_b2b_office | 1 |
| interests_b2b_it_for_business | 1 |
| individual_income_a_below_average_income | 0 |
| interests_credit_cards | 0 |
| interests_learning_languages | 0 |
| interests_education | 0 |
| interests_pregnancy_and_childbirth | 0 |
| interests_human_resources | 0 |
| interests_medium_and_large_business | 0 |
| interests_finance_and_accounting | 0 |
| interests_legal_support | 0 |
| interests_baby_food | 0 |
| individual_income_d_high_income | 0 |
| individual_income_e_premium | 0 |
| interests_wedding | 0 |
| interests_microloans | 0 |
| interests_small_business | 0 |
| interests_contributions_and_deposits | 0 |
| interests_commercial_real_estate | 0 |
| interests_parents_of_preschoolers | 0 |
| interests_parents_of_toddlers | 0 |
| interests_use_of_electronic_money | 0 |
| interests_mobile_communications_and_internet_access | 0 |
| interests_b2b_advertising_and_marketing | 0 |
| interests_higher | 0 |
| interests_specialized_secondary | 0 |
| interests_b2b_equipment_machines_energy_supply | 0 |
| interests_b2b_documentary_and_financial_and_legal_support | 0 |
| interests_b2b_medical_equipment | 0 |
| interests_childrens_health | 0 |
| interests_tvs | 0 |
| interests_telecom_operators | 0 |
| interests_internet_access | 0 |
| interests_renting_residential_property | 0 |
| interests_b2b_agriculture | 0 |
| interests_active_mobile_internet_users | 0 |
| interests_interest_in_buying_a_new_premium_car | 0 |
| interests_tablets_and_ereaders | 0 |
| interests_laptops_and_netbooks | 0 |
| interests_baby_products | 0 |
| interests_audio_engineering | 0 |
| interests_television_and_video_equipment | 0 |
| interests_auto_electronics_and_gps | 0 |
| interests_consumer_loans | 0 |
| interests_loans_for_business | 0 |
| interests_quotes_stock_markets | 0 |
| interests_interest_in_insurance | 0 |
| interests_houses_cottages_and_land_plots | 0 |
| interests_overseas_real_estate | 0 |
| interests_resale_property | 0 |
| interests_all_about_children | 0 |
| interests_mobile_devices | 0 |
| interests_preschool | 0 |
| interests_basic | 0 |
| interests_average | 0 |
| interests_business_education | 0 |
# Show the grouped row (ids, count, geometry) of the selected hexagon.
locs_df_g.loc[locs_df_g["h3_cell"] == h3_cell]
| h3_cell | ids | count | geometry | |
|---|---|---|---|---|
| 0 | 8911810832fffff | [12410, 56052, 14802, 33036, 33036, 56052, 124... | 17 | POLYGON ((37.98722651938006 54.87444152107866,... |
# Display the rows of `df` selected above for the chosen hexagon.
df
| interests_b2b_advertising_and_marketing | interests_b2b_raw_materials | interests_b2b_equipment_machines_energy_supply | interests_b2b_office | interests_b2b_documentary_and_financial_and_legal_support | interests_b2b_medical_equipment | interests_b2b_trade_equipment_and_goods_wholesale | interests_childrens_health | interests_tvs | interests_telecom_operators | ... | interests_credit_cards | interests_freight_and_commercial_vehicles | interests_special_equipment | interests_parents_of_newborns | interests_parents_of_toddlers | interests_parents_of_preschoolers | interests_of_parents_of_primary_school_students | interests_parents_of_middle_and_high_school_students | interests_business_education | id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18322 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1277 |
| 44439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12410 |
| 67571 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 56052 |
| 105688 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 33036 |
4 rows × 87 columns